library(edgeR)
library(ggplot2)
library(readr)
library(here)

# Create output directories if they don't exist
dir.create(file.path("edgeR_transcript_prot"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("edgeR_gene_prot"), showWarnings = FALSE, recursive = TRUE)

Transcripts

Load count data - Transcripts

counts <- read.csv("/Volumes/sheynkman/projects/LRP_Mohi_project/06_refine_orf_database/protein_counts_matrix.csv", 
                   header = TRUE, row.names = 1)

group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))

dge_raw <- DGEList(counts = counts, group = group)

Visualize raw counts

boxplot(dge_raw$counts, main = "Boxplot of Raw Counts", las = 2, col = as.numeric(group))

plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Counts")

plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)

MD (mean-difference) plots per sample

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
  plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
}

par(mfrow = c(1, 1)) # Reset plotting area to single panel

Filter lowly expressed transcripts (required for modeling)

keep <- filterByExpr(dge_raw)
table(keep)
## keep
##  FALSE   TRUE 
## 144622  27450
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]

Normalize Counts with TMM

dge <- calcNormFactors(dge, method = "TMM")

Save counts matrices

# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_transcript_prot", "dte_raw.rda"))
save(dge,      file = file.path("edgeR_transcript_prot", "dte_norm.rda"))

Visualize normalized counts

boxplot(dge$counts, main = "Boxplot of Normalized Counts")

Log-transform and boxplot: Protein-level Transcript CPM

log_cpm_tx <- cpm(dge, log = TRUE, prior.count = 1)

boxplot(log_cpm_tx,
        main = "Boxplot of log2(CPM + 1) - Protein Transcripts",
        las = 2,
        col = as.numeric(group))

, las = 2, col = as.numeric(group)) plotMDS(dge, col = as.numeric(group), main = “MDS Plot for Normalized Counts”) plotMD(dge, col = as.numeric(group), main = “MD Plot for Normalized Counts”) abline(h = 0, col = “red”, lty = 2, lwd = 2)


Design matrix and dispersion

``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)

dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.06947344
plotBCV(dge_disp)

Fit model and test

fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)

result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient:  groupWT 
##                logFC   logCPM         F       PValue        FDR
## PB.21599.2  2.587828 4.864370 120.55023 4.157448e-07 0.01076015
## PB.17614.3  5.731934 1.879437  69.84691 9.345351e-07 0.01076015
## PB.9626.1   2.410092 4.466302  93.26450 1.439576e-06 0.01076015
## PB.18006.1  2.756940 4.316899  88.57754 1.842696e-06 0.01076015
## PB.20180.4  2.182769 4.491793  83.07070 2.496579e-06 0.01076015
## PB.16163.1  2.284374 5.637514  82.26567 2.612815e-06 0.01076015
## PB.11107.2  2.302415 4.344162  81.43147 2.743936e-06 0.01076015
## PB.11890.1  2.253413 4.974258  75.70609 3.863836e-06 0.01223914
## PB.2313.1   2.056590 7.686002  74.86852 4.068739e-06 0.01223914
## PB.24052.20 8.827075 2.250527 103.86255 4.468137e-06 0.01223914

FDR and DEG Summary

FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 93
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
##             BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.21599.2     7.822002    7.576997   9.1346262   41.660344   59.259545
## PB.17614.3     0.000000    0.000000   0.3044875    5.252826    6.452706
## PB.9626.1      8.318638    6.908438   5.3285320   36.045254   33.053657
## PB.18006.1     4.966351    4.234204   5.7852633   30.611296   45.695693
## PB.20180.4     7.201209    8.468408   8.3734074   28.981109   39.506363
## PB.16163.1    17.382228   11.588348  21.1618841   68.467870   94.420208
## PB.11107.2     6.952891    4.902763   8.0689199   28.981109   36.477542
## PB.11890.1    11.670924    8.022703  12.3317454   41.116949   49.119578
## PB.2313.1     91.380854   62.175945  85.1042679  231.124345  344.758862
## PB.24052.20    0.000000    0.000000   0.0000000    5.977354    8.296336
##             BioSample_6
## PB.21599.2    46.996000
## PB.17614.3     7.992517
## PB.9626.1     40.921687
## PB.18006.1    25.895755
## PB.20180.4    40.282286
## PB.16163.1    83.282028
## PB.11107.2    34.207973
## PB.11890.1    64.099987
## PB.2313.1    418.648043
## PB.24052.20   12.148626
summary(decideTests(qlf))
##        groupWT
## Down         6
## NotSig   27357
## Up          87
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")

Save DEG results & intermediate files

deg_results <- topTags(result, n = Inf)$table
write.table(deg_results, 
            file = file.path("edgeR_transcript_prot", "transcript_DEG_results.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

# Raw and normalized CPM Matrices
write.table(cpm(dge_raw), 
            file = file.path("edgeR_transcript_prot", "raw_CPM_matrix.txt"), 
            sep = "\t", quote = FALSE)
write.table(cpm(dge), 
            file = file.path("edgeR_transcript_prot", "normalized_CPM_matrix.txt"), 
            sep = "\t", quote = FALSE)

# List of filtered-in transcripts
write.table(rownames(dge), 
            file = file.path("edgeR_transcript_prot", "filtered_transcripts.txt"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)

# Design matrix to keep record of the model used 
write.table(design, 
            file = file.path("edgeR_transcript_prot", "design_matrix.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)

# Top genes table 
top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes, 
            file = file.path("edgeR_transcript_prot", "top_transcripts.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

# Save dispersion plots (BCV and QL)
png(file.path("edgeR_transcript_prot", "BCV_plot.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_transcript_prot", "QLDisp_plot.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen 
##                 2
# MDS plot (normalized)
png(file.path("edgeR_transcript_prot", "MDS_plot.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot for Normalized Counts")
dev.off()
## quartz_off_screen 
##                 2

Volcano Plot - left = downregulated in Q157R vs. WT (higher in WT); right = up-regulated in Q157R vs. WT (higher in Q157R)

deg_results$Gene <- rownames(deg_results)

ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) + 
  geom_point(aes(color = FDR < 0.05)) + 
  scale_color_manual(values = c("black", "red")) + 
  theme_minimal() + 
  labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

ggsave(file.path("edgeR_transcript_prot", "transcript_volcano_plot.png"), 
       width = 6, height = 6, dpi = 300)

Interactive volcano plot

library(ggiraph)
library(plotly)

gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
                              tooltip = Gene, data_id = Gene)) +
  geom_point_interactive(aes(color = FDR < 0.05)) +
  scale_color_manual(values = c("black", "red")) +
  theme_minimal() +
  labs(title = "Interactive Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

girafe(ggobj = gg, width_svg = 10, height_svg = 6)

Save count matrices

# Transcript raw counts
write.table(dge_raw$counts, 
            file.path("edgeR_transcript_prot", "raw_counts_transcripts.txt"), 
            sep = "\t", quote = FALSE)

# Transcript normalized counts
write.table(cpm(dge), 
            file.path("edgeR_transcript_prot", "normalized_counts_transcripts.txt"), 
            sep = "\t", quote = FALSE)

Genes

counts_gene <- read.delim("/Volumes/sheynkman/projects/LRP_Mohi_project/19_LRP_summary/protein/raw_protein_gene_counts_matrix.txt", 
                          header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Use 'gene_id' as rownames
rownames(counts_gene) <- counts_gene$gene_id
counts_gene$gene_id <- NULL

counts <- as.data.frame(counts_gene)

group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))
dge_raw <- DGEList(counts = counts, group = group)

Visualize raw counts - Genes

boxplot(dge_raw$counts, main = "Boxplot of Raw Gene Counts", las = 2, col = as.numeric(group))

plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Gene Counts")

plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Gene Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)

MD plots per sample - Genes

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
  plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
}

par(mfrow = c(1, 1))

Filter lowly expressed genes

keep <- filterByExpr(dge_raw)
table(keep)
## keep
## FALSE  TRUE 
##  2539  9886
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]

Normalize Counts with TMM - Genes

dge <- calcNormFactors(dge, method = "TMM")

Save counts matrices - Genes

# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_gene_prot", "dge_raw.rda"))
save(dge,      file = file.path("edgeR_gene_prot", "dge_norm.rda"))

Visualize normalized counts - Genes

boxplot(dge$counts, main = "Boxplot of Normalized Gene Counts")

## Log-transform and boxplot: Protein-level Gene CPM

log_cpm_gene <- cpm(dge, log = TRUE, prior.count = 1)

boxplot(log_cpm_gene,
        main = "Boxplot of log2(CPM + 1) - Protein Genes",
        las = 2,
        col = as.numeric(group))

, las = 2, col = as.numeric(group)) plotMDS(dge, col = as.numeric(group), main = “MDS Plot for Normalized Gene Counts”) plotMD(dge, col = as.numeric(group), main = “MD Plot for Normalized Gene Counts”) abline(h = 0, col = “red”, lty = 2, lwd = 2)


Design matrix and dispersion - Genes

``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)

dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.05164303
plotBCV(dge_disp)

Fit model and test - Genes

fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)

result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient:  groupWT 
##             logFC   logCPM        F       PValue        FDR
## PB.11107 1.825258 4.524892 98.13461 6.479355e-06 0.02571292
## PB.3791  2.265128 3.983362 89.44119 9.304025e-06 0.02571292
## PB.16163 2.210740 5.610552 89.21069 9.376582e-06 0.02571292
## PB.11890 1.915878 5.132346 82.72761 1.254603e-05 0.02571292
## PB.21599 1.643707 5.148811 72.40887 2.087808e-05 0.02571292
## PB.18006 2.115223 4.537860 71.00810 2.250012e-05 0.02571292
## PB.2313  2.024386 7.653194 69.98660 2.374787e-05 0.02571292
## PB.10619 1.607534 4.731096 69.24039 2.474488e-05 0.02571292
## PB.7848  2.263690 3.813640 67.55444 2.722135e-05 0.02571292
## PB.14007 2.107100 5.351992 66.39023 2.899683e-05 0.02571292

FDR and DEG Summary - Genes

FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 64
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
##          BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.11107    9.857822    9.445224   10.549382    31.42123    38.63298
## PB.3791     4.502956    4.393127    6.936580    23.47960    25.41306
## PB.16163   18.376928   11.861444   20.809741    66.46799    92.79617
## PB.11890   15.821196   12.959726   14.595721    44.36954    52.75135
## PB.21599   17.281614   15.595602   18.064011    44.36954    63.66100
## PB.18006    9.249315    8.127286    8.237189    32.62974    49.15759
## PB.2313    92.736548   63.261034   81.215794   222.53828   339.48254
## PB.10619   14.117375   11.641788   12.861576    35.04676    37.22115
## PB.7848     4.624657    4.612784    4.913411    17.43706    29.90527
## PB.14007   18.742032   11.202475   15.173769    49.37622    65.20118
##          BioSample_6
## PB.11107    36.00306
## PB.3791     28.00238
## PB.16163    80.31452
## PB.11890    67.69806
## PB.21599    51.54284
## PB.18006    29.54097
## PB.2313    406.03452
## PB.10619    46.31163
## PB.7848     20.46328
## PB.14007    82.93013
summary(decideTests(qlf))
##        groupWT
## Down         2
## NotSig    9822
## Up          62
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")

Save DEG results & intermediate files - Genes

deg_results <- topTags(result, n = Inf)$table
write.table(deg_results, 
            file.path("edgeR_gene_prot", "gene_DEG_results.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

write.table(cpm(dge_raw), 
            file.path("edgeR_gene_prot", "raw_CPM_matrix_gene.txt"), 
            sep = "\t", quote = FALSE)
write.table(cpm(dge), 
            file.path("edgeR_gene_prot", "normalized_CPM_matrix_gene.txt"), 
            sep = "\t", quote = FALSE)
write.table(rownames(dge), 
            file.path("edgeR_gene_prot", "filtered_genes.txt"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(design, 
            file.path("edgeR_gene_prot", "design_matrix_gene.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)

top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes, 
            file.path("edgeR_gene_prot", "top_genes.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

png(file.path("edgeR_gene_prot", "BCV_plot_gene.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_gene_prot", "QLDisp_plot_gene.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_gene_prot", "MDS_plot_gene.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot (Normalized Genes)")
dev.off()
## quartz_off_screen 
##                 2

Volcano Plot - Genes

deg_results$Gene <- rownames(deg_results)

ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) + 
  geom_point(aes(color = FDR < 0.05)) + 
  scale_color_manual(values = c("black", "red")) + 
  theme_minimal() + 
  labs(title = "Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

ggsave(file.path("edgeR_gene_prot", "gene_volcano_plot.png"), 
       width = 6, height = 6, dpi = 300)

Interactive volcano plot - Genes

gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
                              tooltip = Gene, data_id = Gene)) +
  geom_point_interactive(aes(color = FDR < 0.05)) +
  scale_color_manual(values = c("black", "red")) +
  theme_minimal() +
  labs(title = "Interactive Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

girafe(ggobj = gg, width_svg = 10, height_svg = 6)

Save count matrices

# Gene raw counts
write.table(dge_raw$counts, 
            file.path("edgeR_gene_prot", "raw_counts_genes.txt"), 
            sep = "\t", quote = FALSE)

# Gene normalized counts
write.table(cpm(dge), 
            file.path("edgeR_gene_prot", "normalized_counts_genes.txt"), 
            sep = "\t", quote = FALSE)